{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pprint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "text_corpus = [\n",
    "    \"Human machine interface for lab abc computer applications\",\n",
    "    \"A survey of user opinion of computer system response time\",\n",
    "    \"The EPS user interface management system\",\n",
    "    \"System and human system engineering testing of EPS\",\n",
    "    \"Relation of user perceived response time to error measurement\",\n",
    "    \"The generation of random binary unordered trees\",\n",
    "    \"The intersection graph of paths in trees\",\n",
    "    \"Graph minors IV Widths of trees and well quasi ordering\",\n",
    "    \"Graph minors A survey\",\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['human', 'interface', 'computer'],\n",
      " ['survey', 'user', 'computer', 'system', 'response', 'time'],\n",
      " ['eps', 'user', 'interface', 'system'],\n",
      " ['system', 'human', 'system', 'eps'],\n",
      " ['user', 'response', 'time'],\n",
      " ['trees'],\n",
      " ['graph', 'trees'],\n",
      " ['graph', 'minors', 'trees'],\n",
      " ['graph', 'minors', 'survey']]\n"
     ]
    }
   ],
   "source": [
    "# Create a set of frequent words\n",
    "stoplist = set('for a of the and to in'.split(' ')) # 创建一个无序不重复元素集\n",
    "# Lowercase each document, split it by white space and filter out stopwords\n",
    "texts = [[word for word in document.lower().split() if word not in stoplist]\n",
    "         for document in text_corpus]\n",
    "\n",
    "# Count word frequencies\n",
    "from collections import defaultdict\n",
    "frequency = defaultdict(int)\n",
    "for text in texts: #每行\n",
    "    for token in text:\n",
    "        frequency[token] += 1\n",
    "\n",
    "# Only keep words that appear more than once\n",
    "processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]\n",
    "pprint.pprint(processed_corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)\n"
     ]
    }
   ],
   "source": [
    "from gensim import corpora\n",
    "\n",
    "dictionary = corpora.Dictionary(processed_corpus)\n",
    "print(dictionary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'computer': 0,\n",
      " 'eps': 8,\n",
      " 'graph': 10,\n",
      " 'human': 1,\n",
      " 'interface': 2,\n",
      " 'minors': 11,\n",
      " 'response': 3,\n",
      " 'survey': 4,\n",
      " 'system': 5,\n",
      " 'time': 6,\n",
      " 'trees': 9,\n",
      " 'user': 7}\n"
     ]
    }
   ],
   "source": [
    "pprint.pprint(dictionary.token2id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, 1), (1, 1)]\n"
     ]
    }
   ],
   "source": [
    "new_doc = \"Human computer interaction\"\n",
    "new_vec = dictionary.doc2bow(new_doc.lower().split())\n",
    "print(new_vec)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[(0, 1), (1, 1), (2, 1)],\n",
      " [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],\n",
      " [(2, 1), (5, 1), (7, 1), (8, 1)],\n",
      " [(1, 1), (5, 2), (8, 1)],\n",
      " [(3, 1), (6, 1), (7, 1)],\n",
      " [(9, 1)],\n",
      " [(9, 1), (10, 1)],\n",
      " [(9, 1), (10, 1), (11, 1)],\n",
      " [(4, 1), (10, 1), (11, 1)]]\n"
     ]
    }
   ],
   "source": [
    "bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]\n",
    "pprint.pprint(bow_corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(5, 0.5898341626740045), (11, 0.8075244024440723)]\n"
     ]
    }
   ],
   "source": [
    "from gensim import models\n",
    "\n",
    "# train the model\n",
    "tfidf = models.TfidfModel(bow_corpus)\n",
    "\n",
    "# transform the \"system minors\" string\n",
    "words = \"system minors\".lower().split()\n",
    "print(tfidf[dictionary.doc2bow(words)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from gensim import similarities\n",
    "\n",
    "index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, 0.0), (1, 0.32448703), (2, 0.41707572), (3, 0.7184812), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]\n"
     ]
    }
   ],
   "source": [
    "query_document = 'system engineering'.split()\n",
    "query_bow = dictionary.doc2bow(query_document)\n",
    "sims = index[tfidf[query_bow]]\n",
    "print(list(enumerate(sims)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3 0.7184812\n",
      "2 0.41707572\n",
      "1 0.32448703\n",
      "0 0.0\n",
      "4 0.0\n",
      "5 0.0\n",
      "6 0.0\n",
      "7 0.0\n",
      "8 0.0\n"
     ]
    }
   ],
   "source": [
    "for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):\n",
    "    print(document_number, score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from gensim.models import Word2Vec\n",
    "# sentences只需要是一个可迭代对象就可以\n",
    "sentences = [[\"cat\", \"say\", \"meow\"], [\"dog\", \"say\", \"woof\"]]\n",
    "model = Word2Vec(processed_corpus, min_count=1)  # 执行这一句的时候就是在训练模型了"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x000000000A8EFC88>\n",
      "human\n",
      "interface\n",
      "computer\n",
      "survey\n",
      "user\n",
      "system\n",
      "response\n",
      "time\n",
      "eps\n",
      "trees\n",
      "graph\n",
      "minors\n"
     ]
    }
   ],
   "source": [
    "for i, word in enumerate(model.wv.vocab):\n",
    "    print(word)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'human'\t'computer'\t0.18\n",
      "'trees'\t'response'\t0.08\n",
      "'eps'\t'minors'\t0.04\n"
     ]
    }
   ],
   "source": [
    "pairs = [\n",
    "    ('human', 'computer'),   # a minivan is a kind of car\n",
    "    ('trees', 'response'),   # still a wheeled vehicle\n",
    "    ('eps', 'minors'),  # ok, no wheels, but still a vehicle\n",
    "]\n",
    "for w1, w2 in pairs:\n",
    "    print('%r\\t%r\\t%.2f' % (w1, w2,model.wv.similarity(w1, w2)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:root] *",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
